import numpy as np
import torch
from sklearn.datasets import fetch_california_housing
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
import matplotlib.pyplot as plt
from typing import Any
# Use the GPU when one is available; every tensor and model below is
# created directly on this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
cuda
# Load the California-housing regression dataset as a single DataFrame
# (8 feature columns plus the MedHouseVal target; 20640 rows).
california_housing = fetch_california_housing(as_frame=True).frame
california_housing
| MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | MedHouseVal | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 8.3252 | 41.0 | 6.984127 | 1.023810 | 322.0 | 2.555556 | 37.88 | -122.23 | 4.526 |
| 1 | 8.3014 | 21.0 | 6.238137 | 0.971880 | 2401.0 | 2.109842 | 37.86 | -122.22 | 3.585 |
| 2 | 7.2574 | 52.0 | 8.288136 | 1.073446 | 496.0 | 2.802260 | 37.85 | -122.24 | 3.521 |
| 3 | 5.6431 | 52.0 | 5.817352 | 1.073059 | 558.0 | 2.547945 | 37.85 | -122.25 | 3.413 |
| 4 | 3.8462 | 52.0 | 6.281853 | 1.081081 | 565.0 | 2.181467 | 37.85 | -122.25 | 3.422 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20635 | 1.5603 | 25.0 | 5.045455 | 1.133333 | 845.0 | 2.560606 | 39.48 | -121.09 | 0.781 |
| 20636 | 2.5568 | 18.0 | 6.114035 | 1.315789 | 356.0 | 3.122807 | 39.49 | -121.21 | 0.771 |
| 20637 | 1.7000 | 17.0 | 5.205543 | 1.120092 | 1007.0 | 2.325635 | 39.43 | -121.22 | 0.923 |
| 20638 | 1.8672 | 18.0 | 5.329513 | 1.171920 | 741.0 | 2.123209 | 39.43 | -121.32 | 0.847 |
| 20639 | 2.3886 | 16.0 | 5.254717 | 1.162264 | 1387.0 | 2.616981 | 39.37 | -121.24 | 0.894 |
20640 rows × 9 columns
# Build the modelling frame: keep one feature (MedInc) and the target
# (MedHouseVal), assign a reproducible 80/20 train/test split, and
# z-score both columns (the 1e-6 guards against a zero std).
# NOTE(review): mean/std are computed over the FULL frame (train + test
# rows together), so test statistics leak into the normalisation — fine
# for a demo, but fit the scaler on the train split only for a real eval.
data = (
    california_housing
    [["MedInc", "MedHouseVal"]]
    .assign(split=lambda df: np.random.RandomState(42).choice(["train", "test"], p=[.8, .2], size=df.shape[0]))
    .rename(columns={"MedInc": "X_orig", "MedHouseVal": "y_orig"})
    .assign(
        X=lambda df: (df["X_orig"] - df["X_orig"].mean()) / (df["X_orig"].std() + 1e-6),
        y=lambda df: (df["y_orig"] - df["y_orig"].mean()) / (df["y_orig"].std() + 1e-6)
    )
)
data
| X_orig | y_orig | split | X | y | |
|---|---|---|---|---|---|
| 0 | 8.3252 | 4.526 | train | 2.344708 | 2.129578 |
| 1 | 8.3014 | 3.585 | test | 2.332180 | 1.314123 |
| 2 | 7.2574 | 3.521 | train | 1.782655 | 1.258662 |
| 3 | 5.6431 | 3.413 | train | 0.932944 | 1.165071 |
| 4 | 3.8462 | 3.422 | train | -0.012881 | 1.172870 |
| ... | ... | ... | ... | ... | ... |
| 20635 | 1.5603 | 0.781 | train | -1.216098 | -1.115776 |
| 20636 | 2.5568 | 0.771 | test | -0.691576 | -1.124442 |
| 20637 | 1.7000 | 0.923 | train | -1.142565 | -0.992722 |
| 20638 | 1.8672 | 0.847 | test | -1.054557 | -1.058582 |
| 20639 | 2.3886 | 0.894 | train | -0.780110 | -1.017852 |
20640 rows × 5 columns
# Materialise per-split numpy views of the raw and normalised columns,
# then float32 torch tensors of shape (N, 1) on the training device.
train_rows = data[data["split"] == "train"]
test_rows = data[data["split"] == "test"]
X_train_orig_numpy = train_rows["X_orig"].to_numpy()
y_train_orig_numpy = train_rows["y_orig"].to_numpy()
X_train_numpy = train_rows["X"].to_numpy()
y_train_numpy = train_rows["y"].to_numpy()
X_test_numpy = test_rows["X"].to_numpy()
y_test_numpy = test_rows["y"].to_numpy()
X_test_orig_numpy = test_rows["X_orig"].to_numpy()
y_test_orig_numpy = test_rows["y_orig"].to_numpy()
X_train = torch.tensor(X_train_numpy.reshape(-1, 1), dtype=torch.float32, device=device)
y_train = torch.tensor(y_train_numpy.reshape(-1, 1), dtype=torch.float32, device=device)
X_test = torch.tensor(X_test_numpy.reshape(-1, 1), dtype=torch.float32, device=device)
y_test = torch.tensor(y_test_numpy.reshape(-1, 1), dtype=torch.float32, device=device)
def viz_data():
    """Plot train vs test histograms of the raw and normalised feature."""
    fig, axes = plt.subplots(nrows=2, figsize=(8, 8))
    panels = [
        (axes[0], X_train_orig_numpy, X_test_orig_numpy, "X_<split>_orig_numpy"),
        (axes[1], X_train_numpy, X_test_numpy, "X_<split>_numpy"),
    ]
    for ax, train_values, test_values, xlabel in panels:
        ax.hist(train_values, bins=20, label="train", alpha=.5, density=True)
        ax.hist(test_values, bins=20, label="test", alpha=.5, density=True)
        ax.set_xlabel(xlabel)
        ax.legend()
viz_data()
class MCDropout(torch.nn.Module):
    """Dropout that stays active at evaluation time (Monte-Carlo dropout).

    Unlike ``torch.nn.Dropout``, ``forward`` always invokes the functional
    dropout with ``training=True``, so stochastic masks are applied even
    after ``net.eval()`` — enabling MC sampling of predictions.
    """
    def __init__(self, p: float) -> None:
        # nn.Module requires super().__init__() to run BEFORE any attribute
        # assignment on the instance; the original set self.p first.
        super().__init__()
        self.p = p
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # training=True forces dropout regardless of self.training.
        return torch.nn.functional.dropout(input, p=self.p, training=True)
def create_net(net_description: dict[str, Any]) -> torch.nn.Module:
    """Build a small MLP (1 -> hidden x 3 -> 1) from a description dict.

    Recognised keys (all optional):
        activation: only "gelu" is implemented (default "gelu").
        dropout_type: "regular", "monte_carlo", or "None"/None for no
            dropout (default "regular").
        dropout_p: dropout probability (default .5).
        num_hidden_neurons: width of each hidden layer (default 50).

    Returns:
        The network, moved to the module-level ``device``.

    Raises:
        NotImplementedError: for an unknown activation or dropout type.
    """
    def get_activation() -> torch.nn.Module:
        act = net_description.get("activation", "gelu")
        if act == "gelu":
            return torch.nn.GELU()
        raise NotImplementedError(f"Unknown activation: {act}.")
    def get_dropout() -> list[torch.nn.Module]:
        dropout_p = net_description.get("dropout_p", .5)
        # BUG FIX: the default here used to be the float .5 (copy-paste
        # from dropout_p), so an omitted "dropout_type" key raised
        # NotImplementedError("Unknown dropout: 0.5."). Default to
        # "regular", matching the dropout_p default.
        dropout_type = net_description.get("dropout_type", "regular")
        if dropout_type == "regular":
            return [torch.nn.Dropout(p=dropout_p)]
        if dropout_type == "monte_carlo":
            return [MCDropout(p=dropout_p)]
        # Accept both the literal string "None" (as existing callers pass)
        # and an actual None for "no dropout".
        if dropout_type == "None" or dropout_type is None:
            return []
        raise NotImplementedError(f"Unknown dropout: {dropout_type}.")
    num_hidden_neurons = net_description.get("num_hidden_neurons", 50)
    return torch.nn.Sequential(
        torch.nn.Linear(in_features=1, out_features=num_hidden_neurons, bias=True),
        get_activation(),
        # deliberately no dropout after the first hidden layer
        torch.nn.Linear(in_features=num_hidden_neurons, out_features=num_hidden_neurons, bias=True),
        get_activation(),
        *get_dropout(),
        torch.nn.Linear(in_features=num_hidden_neurons, out_features=num_hidden_neurons, bias=True),
        get_activation(),
        *get_dropout(),
        torch.nn.Linear(in_features=num_hidden_neurons, out_features=1, bias=True),
        torch.nn.Sigmoid(),
        torch.nn.Linear(in_features=1, out_features=1, bias=True),
    ).to(device=device)
def train_net(
    net_description: dict[str, Any],
    random_seed: int,
    num_epochs: int,
    batch_size: int,
    learning_rate: float,
    gamma: float,
    model_debug_data_save_frequency: int = 10
) -> dict[str, Any]:
    """Train a fresh net on the module-level train/test tensors.

    Args:
        net_description: forwarded verbatim to ``create_net``.
        random_seed: seeds torch before the net is created, so weight
            init and dropout masks are reproducible.
        num_epochs: number of full passes over the training set.
        batch_size: minibatch size for both loaders.
        learning_rate: initial AdamW learning rate.
        gamma: per-epoch multiplicative LR decay (ExponentialLR).
        model_debug_data_save_frequency: every this many epochs, gradients
            and weights from the epoch's first batch are snapshotted.

    Returns:
        dict with "metrics" (one row per epoch: losses, LR, optional
        debug snapshot) and "net" (the trained module).
    """
    torch.manual_seed(random_seed)
    net = create_net(net_description=net_description)
    optim = torch.optim.AdamW(params=net.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optim, gamma=gamma)
    train_dataset = TensorDataset(X_train, y_train)
    test_dataset = TensorDataset(X_test, y_test)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    metrics = []
    for i_epoch in range(num_epochs):
        model_debug_data = None
        # Print losses only every 50th epoch.
        verbose = i_epoch % 50 == 0
        net.train()
        train_losses = []
        for batch_index, (input, target) in enumerate(train_loader):
            net.zero_grad()
            output = net.forward(input)
            loss = torch.nn.functional.mse_loss(output, target)
            loss.backward()
            optim.step()
            # Snapshot gradients/weights (as CPU numpy copies) from the
            # FIRST batch of every save-frequency-th epoch.
            if batch_index == 0 and i_epoch % model_debug_data_save_frequency == 0:
                model_debug_data = {
                    "gradients": {
                        name: param.grad.detach().cpu().numpy()
                        for name, param in net.named_parameters()
                    },
                    "weights": {
                        name: param.detach().cpu().numpy()
                        for name, param in net.named_parameters()
                    }
                }
            train_losses.append(loss.item())
        # Record the LR actually used this epoch BEFORE the scheduler decays it.
        current_learning_rate = optim.param_groups[0]["lr"]
        scheduler.step()
        train_loss = np.array(train_losses).mean()
        if verbose:
            print(f"[Epoch: {i_epoch}] Train loss: {train_loss}")
        # Evaluation pass; eval() disables regular Dropout (MCDropout stays on).
        net.eval()
        with torch.no_grad():
            test_losses = []
            for input, target in test_loader:
                output = net.forward(input)
                loss = torch.nn.functional.mse_loss(output, target)
                test_losses.append(loss.item())
            test_loss = np.array(test_losses).mean()
            if verbose:
                print(f"[Epoch: {i_epoch}] Test loss: {test_loss}")
        metrics.append(
            {
                "epoch": i_epoch,
                "train_loss": train_loss,
                "test_loss": test_loss,
                "model_debug_data": model_debug_data,
                "current_learning_rate": current_learning_rate
            }
        )
    return {
        "metrics": pd.DataFrame(metrics),
        "net": net,
    }
def viz_training(training_bundle, num_pred: int = 1):
    """Plot losses, predictions vs data, and gradient/weight magnitudes.

    Args:
        training_bundle: dict returned by ``train_net`` ("net" + "metrics").
        num_pred: number of forward passes to scatter; values > 1 only show
            spread when the net contains MCDropout — with regular dropout,
            eval() makes every pass identical.
    """
    net: torch.nn.Module = training_bundle["net"]
    metrics_df: pd.DataFrame = training_bundle["metrics"]
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 20))
    loss_ax, pred_vs_true_ax, gradient_ax, weight_ax = axes.ravel()
    # Top-left panel: train/test loss per epoch.
    loss_ax.plot(metrics_df["epoch"], metrics_df["train_loss"], label="train_loss")
    loss_ax.plot(metrics_df["epoch"], metrics_df["test_loss"], label="test_loss")
    loss_ax.set_xlabel("Epoch")
    loss_ax.set_ylabel("Loss")
    loss_ax.legend()
    # Top-right panel: ground truth scatter, then model predictions on top.
    pred_vs_true_ax.scatter(X_train_numpy.ravel(), y_train_numpy, label="train true", alpha=.1)
    pred_vs_true_ax.scatter(X_test_numpy.ravel(), y_test_numpy, label="test true", alpha=.1)
    net.eval()
    with torch.no_grad():
        preds_train_x = []
        preds_train_y = []
        preds_test_x = []
        preds_test_y = []
        # Repeat the forward pass num_pred times; stochastic only with MCDropout.
        for _ in range(num_pred):
            preds_train_x.append(X_train_numpy.ravel())
            preds_train_y.append(net.forward(X_train).cpu().numpy().ravel())
            preds_test_x.append(X_test_numpy.ravel())
            preds_test_y.append(net.forward(X_test).cpu().numpy().ravel())
        pred_vs_true_ax.scatter(np.concatenate(preds_train_x), np.concatenate(preds_train_y), label="train pred", alpha=.5)
        pred_vs_true_ax.scatter(np.concatenate(preds_test_x), np.concatenate(preds_test_y), label="test pred", alpha=.5)
    # Bottom panels: mean-squared magnitude of each parameter's gradients
    # (left) and weights (right), per snapshotted epoch.
    for source, ax in zip(["gradients", "weights"], [gradient_ax, weight_ax]):
        # NOTE: despite its name, gradient_df holds weight snapshots too
        # when source == "weights".
        gradient_df = (
            training_bundle["metrics"]
            # Only epochs where a debug snapshot was recorded.
            .loc[lambda df: df["model_debug_data"].notna()]
            .assign(
                # Merge the epoch number into each snapshot's param dict so
                # the expansion below yields one column per parameter name.
                temp=lambda df: [
                    {"epoch": epoch} | model_debug_data[source]
                    for epoch, model_debug_data
                    in zip(df["epoch"], df["model_debug_data"])
                ]
            )
            # Expand the dicts into columns (one per parameter, plus epoch).
            ["temp"].apply(pd.Series)
            .set_index("epoch")
            .assign(
                **{
                    # Reduce each stored array to its mean squared value.
                    # colname=colname binds the loop variable at definition
                    # time (avoids the late-binding closure pitfall).
                    colname: (
                        lambda df, colname=colname: [
                            np.power(vec.ravel(), 2.).mean()
                            for vec in df[colname]
                        ]
                    )
                    for colname in list(training_bundle["metrics"]["model_debug_data"].iloc[0][source].keys())
                }
            )
        )
        for col in gradient_df.columns:
            ax.plot(gradient_df.index.to_numpy(), gradient_df[col].to_numpy(), label=col)
        ax.set_title(source)
        ax.legend()
    pred_vs_true_ax.legend()
# Baseline run: regular dropout (p=.5), seed 42.
training_bundle_default = train_net(
    net_description={"activation": "gelu", "dropout_p": .5, "dropout_type": "regular"},
    random_seed=42,
    num_epochs=300,
    batch_size=1000,
    learning_rate=1e-2,
    gamma=.99,
    model_debug_data_save_frequency=1,
)
viz_training(training_bundle=training_bundle_default)
[Epoch: 0] Train loss: 1.4952805953867294 [Epoch: 0] Test loss: 1.1796560525894164 [Epoch: 50] Train loss: 0.5256583147189197 [Epoch: 50] Test loss: 0.4941104888916016 [Epoch: 100] Train loss: 0.5212734376682955 [Epoch: 100] Test loss: 0.48921377062797544 [Epoch: 150] Train loss: 0.5190125949242536 [Epoch: 150] Test loss: 0.48906358480453493 [Epoch: 200] Train loss: 0.516857880003312 [Epoch: 200] Test loss: 0.48817660808563235 [Epoch: 250] Train loss: 0.5187382540282082 [Epoch: 250] Test loss: 0.4878509402275085
# Same configuration, different seed — checks run-to-run stability.
training_bundle_other_seed = train_net(
    net_description={"activation": "gelu", "dropout_p": .5, "dropout_type": "regular"},
    random_seed=43,
    num_epochs=300,
    batch_size=1000,
    learning_rate=1e-2,
    gamma=.99,
    model_debug_data_save_frequency=1,
)
viz_training(training_bundle=training_bundle_other_seed)
[Epoch: 0] Train loss: 0.763431563096888 [Epoch: 0] Test loss: 0.6667608618736267 [Epoch: 50] Train loss: 0.5220497355741613 [Epoch: 50] Test loss: 0.4903694033622742 [Epoch: 100] Train loss: 0.5193892997853896 [Epoch: 100] Test loss: 0.4885431408882141 [Epoch: 150] Train loss: 0.51900175038506 [Epoch: 150] Test loss: 0.48777252435684204 [Epoch: 200] Train loss: 0.5191797473851372 [Epoch: 200] Test loss: 0.4880248963832855 [Epoch: 250] Train loss: 0.5179159693858203 [Epoch: 250] Test loss: 0.48744412064552306
# Ablation: no dropout at all (the string "None" selects the empty list).
training_bundle_no_dropout = train_net(
    net_description={"activation": "gelu", "dropout_type": "None"},
    random_seed=43,
    num_epochs=300,
    batch_size=1000,
    learning_rate=1e-2,
    gamma=.99,
    model_debug_data_save_frequency=1,
)
viz_training(training_bundle=training_bundle_no_dropout)
[Epoch: 0] Train loss: 0.7599048018455505 [Epoch: 0] Test loss: 0.665152621269226 [Epoch: 50] Train loss: 0.5133729752372292 [Epoch: 50] Test loss: 0.48799843788146974 [Epoch: 100] Train loss: 0.5112034678459167 [Epoch: 100] Test loss: 0.48818291425704957 [Epoch: 150] Train loss: 0.5117473076371586 [Epoch: 150] Test loss: 0.4866418719291687 [Epoch: 200] Train loss: 0.5127150083289427 [Epoch: 200] Test loss: 0.48709317445755007 [Epoch: 250] Train loss: 0.5107313131584841 [Epoch: 250] Test loss: 0.4867681682109833
# Monte-Carlo dropout: masks stay active even under net.eval().
training_bundle_mc = train_net(
    net_description={"activation": "gelu", "dropout_p": .5, "dropout_type": "monte_carlo"},
    random_seed=42,
    num_epochs=300,
    batch_size=1000,
    learning_rate=1e-2,
    gamma=.99,
    model_debug_data_save_frequency=1,
)
viz_training(training_bundle=training_bundle_mc)
[Epoch: 0] Train loss: 1.4952805953867294 [Epoch: 0] Test loss: 1.1811462044715881 [Epoch: 50] Train loss: 0.5257831724251018 [Epoch: 50] Test loss: 0.5057389855384826 [Epoch: 100] Train loss: 0.5239530679057626 [Epoch: 100] Test loss: 0.49976211190223696 [Epoch: 150] Train loss: 0.5209505820975584 [Epoch: 150] Test loss: 0.4927620947360992 [Epoch: 200] Train loss: 0.518083945793264 [Epoch: 200] Test loss: 0.4936288416385651 [Epoch: 250] Train loss: 0.5198071038021761 [Epoch: 250] Test loss: 0.4971176326274872
# Re-plot with 10 stochastic forward passes to visualise the MC-dropout spread.
viz_training(training_bundle=training_bundle_mc, num_pred=10)
# Monte-Carlo dropout again with a different seed.
training_bundle_mc_other_seed = train_net(
    net_description={"activation": "gelu", "dropout_p": .5, "dropout_type": "monte_carlo"},
    random_seed=43,
    num_epochs=300,
    batch_size=1000,
    learning_rate=1e-2,
    gamma=.99,
    model_debug_data_save_frequency=1,
)
viz_training(training_bundle=training_bundle_mc_other_seed)
[Epoch: 0] Train loss: 0.763431563096888 [Epoch: 0] Test loss: 0.6650225281715393 [Epoch: 50] Train loss: 0.522653192281723 [Epoch: 50] Test loss: 0.4953924059867859 [Epoch: 100] Train loss: 0.5203518446754006 [Epoch: 100] Test loss: 0.49757230281829834 [Epoch: 150] Train loss: 0.5198933622416329 [Epoch: 150] Test loss: 0.4940872311592102 [Epoch: 200] Train loss: 0.5204210474210627 [Epoch: 200] Test loss: 0.490051406621933 [Epoch: 250] Train loss: 0.5169882458799026 [Epoch: 250] Test loss: 0.4932470917701721